suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

# Settings ----

# Session-wide ggplot2 default: classic theme at a 7 pt base size, with the
# legend placed at the bottom of the plot.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

# Root directory of the analysis. paste_wd() concatenates this string directly
# with a relative path (no separator is inserted), so the trailing slash is
# required.
wd <- '~/Google Drive/My Drive/Analysis/METTL2A/'

#' Prefix a relative path with the analysis root directory.
#'
#' @param path Character vector of paths relative to `base`.
#' @param base Root directory to prepend. Defaults to the global `wd`, so
#'   existing one-argument calls behave as before. No separator is inserted,
#'   so `base` must already end with a path separator.
#' @return The result of `paste0(base, path)`.
paste_wd <- function(path, base = wd) {
  paste0(base, path)
}

# Output directories for the DRS PolyA figures and tables, resolved under `wd`.
figdir <- paste_wd('Figures/DRS/PolyA/')
tabledir <- paste_wd('Tables/DRS/PolyA/')

# Read data ----

# Load the gzipped, tab-separated chrM poly(A) length table (file dated
# 2025-07-11 in its name) from the analysis directory.
chrM_polyA_length_data <- read_tsv(
  paste_wd('Tables/DRS/PolyA/chrM_polyA_length_data_2025-07-11.tsv.gz')
)
## Rows: 2072277 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): readname, transcript_id, qc_tag, sample_num, type, rep, transcript...
## dbl  (8): position, leader_start, adapter_start, polya_start, transcript_sta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to inspect its dimensions, column names and column types.
chrM_polyA_length_data
## # A tibble: 2,072,277 × 20
##    readname        transcript_id position leader_start adapter_start polya_start
##    <chr>           <chr>            <dbl>        <dbl>         <dbl>       <dbl>
##  1 e2291977-a67e-… ENST00000389…        0            2             3        9363
##  2 3da7d287-48ee-… ENST00000389…        0            2             3       12065
##  3 daa33f45-6c8e-… ENST00000389…        0            2             3       12664
##  4 8d12dc33-6bb7-… ENST00000389…        0            2             3        7793
##  5 ed309b03-0167-… ENST00000387…        0           -1            -1          -1
##  6 498c7f2b-44b3-… ENST00000387…        0          540          2462        5598
##  7 f0e4f4ca-c04c-… ENST00000389…        0            2             3        7919
##  8 5a38a52b-d271-… ENST00000387…        0           -1            -1          -1
##  9 7e677234-de8e-… ENST00000389…        0          265           611        6969
## 10 25aba1c2-5e65-… ENST00000389…        0            2             3        5990
## # ℹ 2,072,267 more rows
## # ℹ 14 more variables: transcript_start <dbl>, read_rate <dbl>,
## #   polya_length <dbl>, qc_tag <chr>, sample_num <chr>, type <chr>, si <dbl>,
## #   rep <chr>, transcript_type <chr>, transcript_name <chr>, gene_id <chr>,
## #   gene_type <chr>, gene_name <chr>, seqname <chr>
# List the distinct transcript_type values present in the table.
unique(chrM_polyA_length_data[['transcript_type']])
## [1] "Mt_rRNA"        "Mt_tRNA"        "protein_coding" NA

# Boxplot ----

# Horizontal boxplot of poly(A) length per transcript for the Mt_rRNA and
# protein_coding transcript types, with transcripts ordered by their median
# poly(A) length.
# NOTE(review): rows are selected on polya_length > 0 only; the table also
# carries a qc_tag column -- confirm whether reads should additionally be
# restricted by it.
chrM_polyA_length_data_boxplot <-
  chrM_polyA_length_data |>
  filter(
    polya_length > 0,
    transcript_type %in% c('Mt_rRNA', 'protein_coding')
  ) |>
  ggplot(aes(
    x = reorder(transcript_name, polya_length, FUN = median),
    y = polya_length,
    fill = transcript_type
  )) +
  # outliers = FALSE discards outlier points, so the axis range follows the
  # boxes and whiskers only.
  geom_boxplot(outliers = FALSE) +
  geom_hline(yintercept = 0, alpha = 1/2) +
  # Named values pin each colour to its transcript type explicitly instead of
  # relying on the implicit ordering of the fill levels.
  scale_fill_manual(
    values = c(Mt_rRNA = '#DE555A', protein_coding = '#DE559F')
  ) +
  scale_y_continuous(breaks = seq(0, 120, 20)) +
  labs(x = '', y = 'poly(A) length (nt)') +
  # Flip so transcripts run down the vertical axis and length along the
  # horizontal one.
  coord_flip()
# Write the boxplot out through ggsave_pdf() (not defined in this file;
# presumably provided by myUtilities) into figdir, with width 6 and height 7
# in whatever units that helper uses.
ggsave_pdf(
  chrM_polyA_length_data_boxplot,
  outdir = figdir,
  width = 6,
  height = 7
)